{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-19T01:25:59.483554Z",
     "start_time": "2017-06-19T01:25:58.780678Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "pd.set_option(\"display.max_rows\",15)\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-19T01:25:59.543687Z",
     "start_time": "2017-06-19T01:25:59.490292Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class dataset:\n",
    "    kdd_train_2labels = pd.read_pickle(\"dataset/kdd_train_2labels_20percent.pkl\")\n",
    "    kdd_train_2labels_y = pd.read_pickle(\"dataset/kdd_train_2labels_y_20percent.pkl\")\n",
    "    \n",
    "    kdd_test_2labels = pd.read_pickle(\"dataset/kdd_test_2labels_20percent.pkl\")\n",
    "    kdd_test_2labels_y = pd.read_pickle(\"dataset/kdd_test_2labels_y_20percent.pkl\")\n",
    "    \n",
    "    kdd_train_5labels = pd.read_pickle(\"dataset/kdd_train_5labels_20percent.pkl\")\n",
    "    kdd_train_5labels_y = pd.read_pickle(\"dataset/kdd_train_5labels_y_20percent.pkl\")\n",
    "    \n",
    "    kdd_test_5labels = pd.read_pickle(\"dataset/kdd_test_5labels_20percent.pkl\")\n",
    "    kdd_test_5labels_y = pd.read_pickle(\"dataset/kdd_test_5labels_y_20percent.pkl\")\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-19T01:26:00.120803Z",
     "start_time": "2017-06-19T01:25:59.551263Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import collections\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n",
    "from sklearn.svm import NuSVC, SVC\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.model_selection import cross_val_predict\n",
    "from sklearn.model_selection import ShuffleSplit\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn import preprocessing\n",
    "from sklearn import metrics\n",
    "from sklearn.decomposition import SparsePCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-19T01:26:00.134795Z",
     "start_time": "2017-06-19T01:26:00.126962Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pca = SparsePCA(n_components=40)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-19T01:45:36.581969Z",
     "start_time": "2017-06-19T01:26:00.145317Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "names_noscaling = [\"Decision Tree\", \"Random Forest\"]\n",
    "classifiers_noscaling = [\n",
    "                DecisionTreeClassifier(),\n",
    "                RandomForestClassifier()\n",
    "                ]\n",
    "\n",
    "names_withscaling = [\"SVC\", \"Non - Linear SVM\", \"AdaBoost\", \"Naive Bayes\"]\n",
    "classifiers_withscaling = [SVC(), NuSVC(),\n",
    "                AdaBoostClassifier(), GaussianNB()]\n",
    "\n",
    "score = collections.namedtuple(\"score\", [\"name\", \"valid_score\" ,\"test_score\"])\n",
    "scores = []\n",
    "cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)\n",
    "\n",
    "x_train, y_train = dataset.kdd_train_2labels.iloc[:,:-2], dataset.kdd_train_2labels_y\n",
    "x_test, y_test = dataset.kdd_test_2labels.iloc[:,:-2], dataset.kdd_test_2labels_y\n",
    "\n",
    "x_train = pca.fit_transform(x_train)\n",
    "x_test = pca.fit_transform(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-19T01:45:36.599681Z",
     "start_time": "2017-06-19T01:45:36.589912Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25192, 40)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-19T01:45:36.610887Z",
     "start_time": "2017-06-19T01:45:36.606039Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(11850, 40)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-19T01:57:26.519728Z",
     "start_time": "2017-06-19T01:45:36.613026Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Classifier: Decision Tree\n",
      "Score: 0.5959493670886076 \n",
      "\n",
      "Classifier: Random Forest\n",
      "Score: 0.6254852320675105 \n",
      "\n",
      "Classifier: SVC\n",
      "Score: 0.6270886075949367 \n",
      "\n",
      "Classifier: Non - Linear SVM\n",
      "Score: 0.8183966244725739 \n",
      "\n",
      "Classifier: AdaBoost\n",
      "Score: 0.6545147679324894 \n",
      "\n",
      "Classifier: Naive Bayes\n",
      "Score: 0.8183966244725739 \n",
      "\n"
     ]
    }
   ],
   "source": [
    "for name, clf in zip(names_noscaling, classifiers_noscaling):\n",
    "    print(\"Classifier: {}\".format(name))\n",
    "    \n",
    "    clf_p = make_pipeline(clf)    \n",
    "    valid_score = cross_val_score(clf_p, x_train, y_train, cv=cv)\n",
    "    \n",
    "    clf.fit(x_train, y_train)\n",
    "    y_pred = clf.predict(x_test)\n",
    "    test_acc = metrics.accuracy_score(y_test, y_pred) \n",
    "    \n",
    "    scores.append(score(name, valid_score.mean(), test_acc))\n",
    "    print(\"Score: {} \\n\".format(test_acc))\n",
    "\n",
    "for name, clf in zip(names_withscaling, classifiers_withscaling):\n",
    "    print(\"Classifier: {}\".format(name))\n",
    "    \n",
    "    clf_p = make_pipeline(preprocessing.StandardScaler(), clf)    \n",
    "    valid_score = cross_val_score(clf_p, x_train, y_train, cv=cv)\n",
    "    \n",
    "    scaler = preprocessing.StandardScaler().fit(x_train)\n",
    "    clf.fit(scaler.transform(x_train), y_train)\n",
    "    y_pred = clf.predict(scaler.transform(x_test))\n",
    "    test_acc = metrics.accuracy_score(y_test, y_pred) \n",
    "    \n",
    "    scores.append(score(name, valid_score.mean(), test_acc))\n",
    "    print(\"Score: {} \\n\".format(test_acc))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-19T01:57:26.533674Z",
     "start_time": "2017-06-19T01:57:26.521636Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>valid_score</th>\n",
       "      <th>test_score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Decision Tree</td>\n",
       "      <td>0.994761</td>\n",
       "      <td>0.595949</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Random Forest</td>\n",
       "      <td>0.995614</td>\n",
       "      <td>0.625485</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>SVC</td>\n",
       "      <td>0.990633</td>\n",
       "      <td>0.627089</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Non - Linear SVM</td>\n",
       "      <td>0.924886</td>\n",
       "      <td>0.818397</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>AdaBoost</td>\n",
       "      <td>0.989720</td>\n",
       "      <td>0.654515</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Naive Bayes</td>\n",
       "      <td>0.923536</td>\n",
       "      <td>0.818397</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               name  valid_score  test_score\n",
       "0     Decision Tree     0.994761    0.595949\n",
       "1     Random Forest     0.995614    0.625485\n",
       "2               SVC     0.990633    0.627089\n",
       "3  Non - Linear SVM     0.924886    0.818397\n",
       "4          AdaBoost     0.989720    0.654515\n",
       "5       Naive Bayes     0.923536    0.818397"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-06-19T01:57:26.572890Z",
     "start_time": "2017-06-19T01:57:26.535505Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Naive Bayes"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
